********************************************************************************
* worker_firm_panel.do
* Purpose: Build three datasets from the raw T4/ROE (employer-employee) records:
*   1. worker_firm_panel.dta  - Long panel of all worker x firm x year spells
*   2. moonlighter.dta        - Workers who held multiple jobs simultaneously
*   3. emp_panel.dta          - Firm x year employment count and average payroll
*
* Input : $data1/T4ROE_YYYY.dta (raw T4/ROE records, 2001-2017)
*         Variables used: entid_syn (firm ID), casenum2019 (worker ID),
*                         t4earn (T4 earnings), year
*
* These datasets are used downstream in:
*   - clean_firm.do   (emp_panel merged into firm-level data)
*   - clean_worker.do (worker_firm_panel merged for tenure)
*   - akm_estimation.do, match_effect_estimation.do
*   - matching_worker.do (moonlighter flag to exclude multi-job workers)
********************************************************************************

//------------------------------------------------------------------------------
// SECTION 1: WORKER-FIRM PANEL (worker_firm_panel.dta)
//
// Appends all T4ROE annual files to create a complete worker x firm x year
// panel. Only keeps records with positive earnings.
// Computes:
//   first_year_at_firm  - First year a worker appeared at a given firm
//   last_year_at_firm   - Last year a worker appeared at a given firm
//   max_gap             - Largest gap (in years) within the worker-firm spell
//                         (max_gap == 1 means continuous employment, no gaps)
//------------------------------------------------------------------------------
clear
forvalues y = 2001/2017 {

	append using $data1\T4ROE_`y', keep(entid_syn casenum2019 t4earn year) force

	* Drop observations with zero or missing earnings (not employed at this firm)
	drop if mi(t4earn) | t4earn == 0
}

* First and last year of employment for each worker-firm pair
gegen first_year_at_firm = min(year), by(casenum2019 entid_syn)
gegen last_year_at_firm  = max(year), by(casenum2019 entid_syn)

* Compute year-to-year gap within each worker-firm spell
gsort casenum2019 entid_syn year
by casenum2019 entid_syn: gen year_diff = year - year[_n-1]

* max_gap = 1 if the employment has been continuous (no gaps in employment)
* max_gap > 1 indicates at least one year of absence within the spell
gegen max_gap = max(year_diff), by(casenum2019 entid_syn)

compress
save $data/worker_firm_panel, replace


//------------------------------------------------------------------------------
// SECTION 2: MOONLIGHTING WORKERS (moonlighter.dta)
//
// Identifies workers who held more than one job for more than one year.
// A worker is a "moonlighter" if they appear at more than one firm in any year.
// Logic: moonlight counts the number of distinct jobs per worker-year;
//        workers with moonlight > 1 in any year are flagged.
// Output: a dataset of casenum2019 IDs for moonlighting workers only.
// Used to drop these workers from the matching sample in clean_worker.do.
//------------------------------------------------------------------------------
preserve
{

* stay_at_firm counts continuous employment years (gap == 1); larger gaps -> missing
gen 	stay_at_firm	= year_diff
replace	stay_at_firm	= . if year_diff > 1

* Sum valid employment records per worker-year (counts multiple jobs)
gegen	moonlight		= total(stay_at_firm), 	by(casenum2019 year)

* Flag workers who are moonlighters in ANY year across the panel
gegen 	moonlighter 	= max(moonlight), 		by(casenum2019)
replace	moonlighter 	= 0 if moonlighter <= 1
replace moonlighter 	= 1 if moonlighter > 1

* Keep only the moonlighter IDs (used as a merge-in flag in clean_worker.do)
keep if moonlighter == 1
duplicates drop casenum2019, force
keep casenum2019

compress
save $data/moonlighter, replace
}


//------------------------------------------------------------------------------
// SECTION 3: FIRM EMPLOYMENT PANEL (emp_panel.dta)
//
// Collapses the worker-firm panel to the firm x year level to compute:
//   total_wage    - Sum of all T4 earnings paid by the firm in year y
//   w_emp         - Number of T4 employment records (worker count)
//   avg_payrolls  - Average earnings per worker (total_wage / w_emp)
//
// This firm-level payroll measure is merged into firm_YYYY.dta in clean_firm.do
// and used as a matching variable.
//------------------------------------------------------------------------------
restore, preserve
{
collapse (sum) total_wage = t4earn (count) count = t4earn, by(entid_syn year)

rename (count) (w_emp)

* Average payroll per worker (using T4-based headcount)
gen avg_payrolls = total_wage/w_emp

compress
save $data/emp_panel, replace
}
